{
 "cells": [
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 基于Transformers的命名实体识别"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step1 导入相关包"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import evaluate\n",
    "from datasets import load_dataset\n",
    "from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step2 加载数据集"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 如果可以联网，直接使用load_dataset进行加载\n",
    "#ner_datasets = load_dataset(\"peoples_daily_ner\", cache_dir=\"./data\")\n",
    "# 如果无法联网，则使用下面的方式加载数据集\n",
    "from datasets import DatasetDict\n",
    "ner_datasets = DatasetDict.load_from_disk(\"ner_data\")\n",
    "ner_datasets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ner_datasets[\"train\"][0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ner_datasets[\"train\"].features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "label_list = ner_datasets[\"train\"].features[\"ner_tags\"].feature.names\n",
    "label_list"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step3 数据集预处理"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenizer = AutoTokenizer.from_pretrained(\"hfl/chinese-macbert-base\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenizer(ner_datasets[\"train\"][0][\"tokens\"], is_split_into_words=True)   # 对于已经做好tokenize的数据，要指定is_split_into_words参数为True"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "res = tokenizer(\"interesting word\")\n",
    "res"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "res.word_ids()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 借助word_ids 实现标签映射\n",
    "def process_function(examples):\n",
    "    tokenized_exmaples = tokenizer(examples[\"tokens\"], max_length=128, truncation=True, is_split_into_words=True)\n",
    "    labels = []\n",
    "    for i, label in enumerate(examples[\"ner_tags\"]):\n",
    "        word_ids = tokenized_exmaples.word_ids(batch_index=i)\n",
    "        label_ids = []\n",
    "        for word_id in word_ids:\n",
    "            if word_id is None:\n",
    "                label_ids.append(-100)\n",
    "            else:\n",
    "                label_ids.append(label[word_id])\n",
    "        labels.append(label_ids)\n",
    "    tokenized_exmaples[\"labels\"] = labels\n",
    "    return tokenized_exmaples"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenized_datasets = ner_datasets.map(process_function, batched=True)\n",
    "tokenized_datasets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(tokenized_datasets[\"train\"][0])"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step4 创建模型"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 对于所有的非二分类任务，切记要指定num_labels，否则就会device错误\n",
    "model = AutoModelForTokenClassification.from_pretrained(\"hfl/chinese-macbert-base\", num_labels=len(label_list))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model.config.num_labels"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step5 创建评估函数"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 这里方便大家加载，替换成了本地的加载方式，无需额外下载\n",
    "seqeval = evaluate.load(\"seqeval_metric.py\")\n",
    "seqeval"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "\n",
    "def eval_metric(pred):\n",
    "    predictions, labels = pred\n",
    "    predictions = np.argmax(predictions, axis=-1)\n",
    "\n",
    "    # 将id转换为原始的字符串类型的标签\n",
    "    true_predictions = [\n",
    "        [label_list[p] for p, l in zip(prediction, label) if l != -100]\n",
    "        for prediction, label in zip(predictions, labels) \n",
    "    ]\n",
    "\n",
    "    true_labels = [\n",
    "        [label_list[l] for p, l in zip(prediction, label) if l != -100]\n",
    "        for prediction, label in zip(predictions, labels) \n",
    "    ]\n",
    "\n",
    "    result = seqeval.compute(predictions=true_predictions, references=true_labels, mode=\"strict\", scheme=\"IOB2\")\n",
    "\n",
    "    return {\n",
    "        \"f1\": result[\"overall_f1\"]\n",
    "    }\n",
    "    "
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step6 配置训练参数"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "args = TrainingArguments(\n",
    "    output_dir=\"models_for_ner\",\n",
    "    per_device_train_batch_size=64,\n",
    "    per_device_eval_batch_size=128,\n",
    "    eval_strategy=\"epoch\",\n",
    "    save_strategy=\"epoch\",\n",
    "    metric_for_best_model=\"f1\",\n",
    "    load_best_model_at_end=True,\n",
    "    logging_steps=50,\n",
    "    num_train_epochs=1\n",
    ")"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step7 创建训练器"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "trainer = Trainer(\n",
    "    model=model,\n",
    "    args=args,\n",
    "    tokenizer=tokenizer,\n",
    "    train_dataset=tokenized_datasets[\"train\"],\n",
    "    eval_dataset=tokenized_datasets[\"validation\"],\n",
    "    compute_metrics=eval_metric,\n",
    "    data_collator=DataCollatorForTokenClassification(tokenizer=tokenizer)\n",
    ")"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step8 模型训练"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "trainer.train()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "trainer.evaluate(eval_dataset=tokenized_datasets[\"test\"])"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step9 模型预测"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import pipeline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 使用pipeline进行推理，要指定id2label\n",
    "model.config.id2label = {idx: label for idx, label in enumerate(label_list)}\n",
    "model.config"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 如果模型是基于GPU训练的，那么推理时要指定device\n",
    "# 对于NER任务，可以指定aggregation_strategy为simple，得到具体的实体的结果，而不是token的结果\n",
    "ner_pipe = pipeline(\"token-classification\", model=model, tokenizer=tokenizer, device=0, aggregation_strategy=\"simple\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "res = ner_pipe(\"小明在北京上班\")\n",
    "res"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 根据start和end取实际的结果\n",
    "ner_result = {}\n",
    "x = \"小明在北京上班\"\n",
    "for r in res:\n",
    "    if r[\"entity_group\"] not in ner_result:\n",
    "        ner_result[r[\"entity_group\"]] = []\n",
    "    ner_result[r[\"entity_group\"]].append(x[r[\"start\"]: r[\"end\"]])\n",
    "\n",
    "ner_result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "transformers",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.16"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
